library(here)
## here() starts at /Users/adelheid/Documents/MEDS/EDS_222/eds222_final_project
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(tmap)

Background

Data Wrangling: All Species

Reading In Data

I used data from two sources: California Department of fish and wildlife and California Protected areas database.

salmon_populations <- read_csv(here("data/Salmonid_Population_Monitoring_Data_CMPv2021.csv"))
## New names:
## Rows: 3918 Columns: 26
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (18): Watershed, Population, Species, Life Stage, Origin, Run designatio... dbl
## (4): ID, CDFW region, GEO_ID_POLY, GEO_ID_PT num (3): Value, X95 lower CI, X95
## upper CI lgl (1): ...26
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...26`
watershed <- st_read("data/ds3001/ds3001.gdb")
## Multiple layers are present in data source /Users/adelheid/Documents/MEDS/EDS_222/eds222_final_project/data/ds3001/ds3001.gdb, reading layer `ds3001'.
## Use `st_layers' to list all layer names and their type in a data source.
## Set the `layer' argument in `st_read' to read a particular layer.
## Warning in evalq((function (..., call. = TRUE, immediate. = FALSE, noBreaks. =
## FALSE, : automatically selected the first layer in a data source containing more
## than one.
## Reading layer `ds3001' from data source 
##   `/Users/adelheid/Documents/MEDS/EDS_222/eds222_final_project/data/ds3001/ds3001.gdb' 
##   using driver `OpenFileGDB'
## Simple feature collection with 157 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -370211.5 ymin: -445468.7 xmax: 179047.7 ymax: 465217.3
## Projected CRS: NAD83 / California Albers
protected_areas <- st_read("data/CPAD_2022a/CPAD_2022a_Holdings.dbf")
## Reading layer `CPAD_2022a_Holdings' from data source 
##   `/Users/adelheid/Documents/MEDS/EDS_222/eds222_final_project/data/CPAD_2022a/CPAD_2022a_Holdings.dbf' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 94534 features and 35 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -374984.2 ymin: -604454.8 xmax: 540016.3 ymax: 449743.2
## Projected CRS: NAD83 / California Albers

Filtering to Adult Salmon Data

In this section I refine the data, for this study I am interested in looking at adult population counts, so I select data on Adults.

Not every watershed polygon has count data for it. I am taking out polygons with no adult data associated. There are 157 polygons of watersheds, I have adult counts for 110 of them.

spawning_data <- salmon_populations |> filter(`Life Stage` %in% "Adult") |> #all adult salmon
  select("Population", "Watershed", "Species", Brood_year = "Brood Year", "GEO_ID_POLY", "Value") |> filter(!is.na(GEO_ID_POLY)) #taking out data with no spatial id


watershed_id <- unique(spawning_data$GEO_ID_POLY) #making a list of all the watersheds that have adult population data 

watershed_new <- watershed |> filter(GEO_ID_POL %in% watershed_id)#filter to watersheds that have spawning data available


protected <- protected_areas |> select(UNIT_NAME, YR_EST) |> st_make_valid()
#selecting relevant columns

Filtering Protected Areas

protected2 <- protected |> filter(YR_EST < 2009| YR_EST %in% c(NA)) #remove after 2009

Calculating Percent Protected: Geo spatial Wrangling

intersect_polygons <- st_intersection(protected2, watershed) %>% 
   mutate(intersect_area = st_area(.)) %>%  # create new column with shape area
   dplyr::select(Name, GEO_ID_POL, intersect_area) #select columns
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
#find total protected area by watershed
total_overlap <- intersect_polygons |> group_by(Name) |> summarize(n())|> 
  mutate(total_protected = st_area(geometry)) 

Mapping

tmap_mode("view")
## tmap mode set to interactive viewing
tmap_options(check.and.fix = TRUE)

tm_shape(total_overlap) + tm_fill(col = "green") + #map protected portions
  tm_shape(watershed_new) + tm_borders(col = "blue") #map watershed boundaries
## Warning: The shape total_overlap is invalid (after reprojection). See
## sf::st_is_valid
## Warning: The shape watershed_new is invalid (after reprojection). See
## sf::st_is_valid
# dropping geometry 
total_overlap_2 <- total_overlap |> st_drop_geometry()

watershed_area <- watershed_new |> mutate(total_area = st_area(Shape)) #find the total area of each watershed 

watershed_protected <- left_join(watershed_area, total_overlap_2, by = "Name") #add area protected column by joining

watershed_final <- watershed_protected |> mutate(percent_protected = as.numeric((total_protected)/ as.numeric(total_area)) *100) |> 
  mutate(percent_protected = round(percent_protected, digits = 0)) |> 
  mutate(percent_protected  = replace_na(percent_protected, 0))

#drop geometry and make it a data frame
watershed_geomless <- st_drop_geometry(watershed_final) |> as.data.frame() 

#all spawning observations with percent protected included 
all_data <- left_join(spawning_data, watershed_geomless, by = c("GEO_ID_POLY" = "GEO_ID_POL"))

Steelhead

Data Wrangling

steelhead <- all_data |> filter(Species == "Steelhead")


steelhead2 <- steelhead |>  group_by(Population, Brood_year) |> summarize(Value = max(Value), percent_protected = mean(percent_protected)) #taking the max estimation for years where estimate done in multiple ways 
## `summarise()` has grouped output by 'Population'. You can override using the
## `.groups` argument.
data_years <- steelhead2 |> group_by(Population) |> summarize(min_year = min(Brood_year), max_year = max(Brood_year), total_year = length(Brood_year)) #find what years I have data over

use_list <- data_years |> filter(min_year <= 2009) |> filter(max_year >= 2018)


list <- use_list$Population


steelhead_usable <- steelhead2 |> filter(Population %in% list) |> filter(Brood_year >= 2009) |> filter(Brood_year <= 2018)

#remove any populations which are zero over this whole time period
steelhead12 <- steelhead_usable |> group_by(Population) |> summarize(max = max(Value)) |> filter(max == 0)
remove <- steelhead12$Population


steelhead_usable <- steelhead_usable |> filter(!Population %in% remove)

remove_list <- steelhead_usable |> group_by(Population) |> summarize(min_year = min(Brood_year), max_year = max(Brood_year), total_year = length(Brood_year)) |> 
  filter(max_year != 2018 | min_year != 2009)

remove <- remove_list$Population


steelhead_usable <- steelhead_usable |> filter(!Population %in% remove)

Regression

\(population = B_0 + B_1year_t + B_2Year_t * Protected +E_i\)

steelhead_new <- steelhead_usable |>  mutate(year = as.numeric(Brood_year))

summary(lm(Value~year + year:percent_protected, data = steelhead_new))
## 
## Call:
## lm(formula = Value ~ year + year:percent_protected, data = steelhead_new)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -574.7 -480.8 -422.3 -111.0 8701.4 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)            -2.533e+04  4.559e+04  -0.556    0.579
## year                    1.284e+01  2.264e+01   0.567    0.571
## year:percent_protected -1.286e-04  1.010e-03  -0.127    0.899
## 
## Residual standard error: 1192 on 328 degrees of freedom
## Multiple R-squared:  0.001025,   Adjusted R-squared:  -0.005066 
## F-statistic: 0.1683 on 2 and 328 DF,  p-value: 0.8452
#running a regression with all the steelhead data
steelhead_all <- all_data |> filter(Species == "Steelhead") |> mutate(year = as.numeric(Brood_year))

summary(lm(Value~year + year:percent_protected, data = steelhead_all))
## 
## Call:
## lm(formula = Value ~ year + year:percent_protected, data = steelhead_all)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -538.4  -396.1  -301.1  -113.3 15848.0 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)  
## (Intercept)            -2.294e+04  1.023e+04  -2.242   0.0251 *
## year                    1.162e+01  5.083e+00   2.286   0.0224 *
## year:percent_protected -4.291e-04  5.163e-04  -0.831   0.4060  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1151 on 1196 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.005664,   Adjusted R-squared:  0.004001 
## F-statistic: 3.407 on 2 and 1196 DF,  p-value: 0.03348

Assumptions of OLS

Linear in Parameters

It’s very difficult to tell if its linear in parameters

ggplot(data = steelhead_new, aes(x = year, y = Value)) + geom_point() + geom_smooth(method=lm, se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

X has variation

This is true

Assumption 4

These do not appear to be normally distributed.

model <- lm(Value~year + year:percent_protected, data = steelhead_new)
residuals <- model$residuals |> as.data.frame()


ggplot(data = residuals) + geom_histogram(aes(x = model$residuals))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Coho

coho <- all_data |> filter(Species == "Coho salmon")

coho2 <- coho |>  group_by(Population, Brood_year) |> summarize(Value = max(Value), percent_protected = max(percent_protected)) #taking the max estimation for years where estimate done in multiple ways 
## `summarise()` has grouped output by 'Population'. You can override using the
## `.groups` argument.
data_years <- coho2 |> group_by(Population) |> summarize(min_year = min(Brood_year), max_year = max(Brood_year), total_year = length(Brood_year)) #find what years I have data over

use_list <- data_years |> filter(min_year <= 2009) |> filter(max_year >= 2018)

list <- use_list$Population


coho_usable <- coho2 |> filter(Population %in% list) |> filter(Brood_year >= 2009) |> filter(Brood_year <= 2018)

#remove any populations which are zero over this whole time period
coho12 <- coho_usable |> group_by(Population) |> summarize(max = max(Value)) |> filter(max == 0)
remove <- coho12$Population


coho_usable2 <- coho_usable |> filter(!Population %in% remove)

remove_list <- coho_usable |> group_by(Population) |> summarize(min_year = min(Brood_year), max_year = max(Brood_year), total_year = length(Brood_year)) |> 
  filter(max_year != 2018 | min_year != 2009)

remove <- remove_list$Population


coho_final <- coho_usable |> filter(!Population %in% remove)

ggplot(data = coho_final) + geom_point(aes(x = Brood_year, y = Value))

coho_final <- coho_final |>  mutate(year = as.numeric(Brood_year))

summary(lm(Value~year + year:percent_protected, data = coho_final))
## 
## Call:
## lm(formula = Value ~ year + year:percent_protected, data = coho_final)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -552.6 -326.7 -201.8  -12.1 4632.9 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)  
## (Intercept)            -5.072e+04  3.604e+04  -1.407   0.1610  
## year                    2.544e+01  1.790e+01   1.421   0.1570  
## year:percent_protected -1.401e-03  7.219e-04  -1.940   0.0538 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 705.9 on 185 degrees of freedom
## Multiple R-squared:  0.03007,    Adjusted R-squared:  0.01958 
## F-statistic: 2.867 on 2 and 185 DF,  p-value: 0.05937
acf(coho_final$Value, lag.max = 10)

Limitations and Issues

There were a lot of NA s in the year protected roughly 32 % of the polygons I used The definition of protected area was very loose. The data was not consistent over time, so I only used data from 2008-2018. Some watersheds within the study have hatchery releases which are not consistent over time, this can wildly affect the population from year to year.